import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.cluster import KMeans  
import pickle
from decision_company import read_csv_file, col_copy, create_standard_scaler, fit_transform_standard_scaler, create_label_encoder, fit_transform_label_encoder, get_dummies, create_kmeans, fit_predict_kmeans, fetch_column, col_assign_val, avg, series_value_counts, series_to_dict

  
# Read the customer records from the CSV file.
credit_customers = read_csv_file("credit_customers.csv")

# Select the columns that feed the clustering step.
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = col_copy(credit_customers, important_columns)

# Label-encode the two categorical columns, each with its own fresh encoder
# ('savings_status' first, then 'employment').
for categorical_column in ('savings_status', 'employment'):
    column_encoder = create_label_encoder()
    encoded_values = fit_transform_label_encoder(column_encoder, data_for_clustering[categorical_column])
    col_assign_val(data_for_clustering, categorical_column, encoded_values)

# One-hot encode 'credit_history' (drop_first=True is passed through to get_dummies).
data_for_clustering = get_dummies(data_for_clustering, columns=['credit_history'], drop_first=True)

# Standard-scale the encoded feature table.
feature_scaler = create_standard_scaler()
data_for_clustering_scaled = fit_transform_standard_scaler(feature_scaler, data_for_clustering)

# Fit K-means with 4 clusters and a fixed random_state, and get one label per row.
kmeans = create_kmeans(n_clusters=4, random_state=42)
cluster_labels = fit_predict_kmeans(kmeans, data_for_clustering_scaled)

# Store the labels on the full dataset under the 'cluster' column.
col_assign_val(credit_customers, 'cluster', cluster_labels)
  
# Define the target customer segments (cluster labels to report on)
target_customer_segments = [1, 2]  # Replace this list with the target customer segments from the previous step

# Summarize the key characteristics of each target customer segment.
# For every segment the summary holds the average age, the average credit
# amount, and the normalized value counts of employment, savings status and
# credit history, taken from the rows whose 'cluster' equals that segment.
summary = {}
for segment in target_customer_segments:
    segment_customers = credit_customers[credit_customers['cluster'] == segment]
    summary[segment] = {
        'average_age': avg(fetch_column(segment_customers, 'age')),
        'average_credit_amount': avg(fetch_column(segment_customers, 'credit_amount')),
        'employment_distribution': series_to_dict(series_value_counts(fetch_column(segment_customers, 'employment'), normalize=True)),
        'savings_status_distribution': series_to_dict(series_value_counts(fetch_column(segment_customers, 'savings_status'), normalize=True)),
        'credit_history_distribution': series_to_dict(series_value_counts(fetch_column(segment_customers, 'credit_history'), normalize=True))
    }

# Print the summary and persist it as a pickle file.
print("summary:\n", summary)
# Use a context manager so the file is flushed and closed deterministically,
# instead of leaving an anonymous handle for the garbage collector.
with open("./ref_result/summary.pkl", "wb") as summary_file:
    pickle.dump(summary, summary_file)
